In this notebook we are going to cover the following practical aspects of data science:
- storing and querying collected tweets with MongoDB (optional)
- loading and cleaning the data with pandas
- extracting links, hashtags, mentions and location from the raw tweets
- sentiment analysis with a pre-trained Naive Bayes classifier (and SentiStrength)
- aggregating candidate sentiment per US state and sanity-checking the results against hashtag-based labels
To complete this assignment you need a running Anaconda installation with Python 2.7 on your device. If this is not the case, refer back to Week 1. Python package prerequisites include:
- pymongo
- pandas and geopandas
- numpy and matplotlib
- nltk
- scikit-learn
An additional requirement, if you would like to use a database, is MongoDB itself: install the MongoDB server and make sure the mongod service is running before executing the cells below.
If MongoDB is installed on your device and a database named Twitter has been created, the tweets can be stored as database entries and queried using the following code:
Note: if MongoDB is not installed or not running on your device, pymongo will raise a connection-refused error as soon as the database is queried.
In [1]:
from pymongo import MongoClient
# connect to the local MongoDB instance and select the 'Twitter' database
client = MongoClient()
db = client.Twitter
#db = client.tweets_sample
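The cell above only opens the connection; the collected tweets then have to be inserted into a tweets collection inside the Twitter database. A minimal sketch, assuming the crawler wrote one JSON tweet per line to a hypothetical tweets.json file:
In [ ]:
# Hedged sketch: bulk-insert the collected tweets into db.tweets.
# 'tweets.json' is a hypothetical file name, one tweet object per line.
import json
with open('tweets.json') as f:
    for line in f:
        db.tweets.insert_one(json.loads(line))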
In [2]:
from pprint import pprint
In [3]:
import pandas as pd
import geopandas as gpd
In [4]:
import numpy as np
import matplotlib.pyplot as plt
In [5]:
import time
In [6]:
start_time = time.time()
# keep only English-language tweets geotagged inside the US
filter_query = {
    "$and": [{"place.country_code": "US"}, {"lang": "en"}]
}
# keep only our fields of interest
columns_query = {
    'text': 1,
    'entities.hashtags': 1,
    'entities.user_mentions': 1,
    'place.full_name': 1,
    'place.bounding_box': 1
}
tweets = pd.DataFrame(list(db.tweets.find(
    filter_query,
    columns_query
)))  # chain .limit() here to experiment on a smaller sample
elapsed_time = time.time() - start_time
print elapsed_time
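MongoDB is optional for the rest of the notebook: if you skipped it, a DataFrame with the same rows can be built straight from a JSON-lines dump of the tweets, applying the same language/country filter in pandas (file name hypothetical; in that case the _id drop in the next cell is not needed):
In [ ]:
# Hedged alternative without MongoDB: load and filter tweets from a JSON-lines file.
# 'tweets.json' is a hypothetical file name.
import json
with open('tweets.json') as f:
    raw = [json.loads(line) for line in f]
raw = [t for t in raw
       if t.get('lang') == 'en'
       and t.get('place') and t['place'].get('country_code') == 'US']
tweets = pd.DataFrame(raw)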
In [7]:
# drop the MongoDB object id column, which we do not need
tweets.drop(['_id'], axis=1, inplace=True)
In [8]:
tweets.head()
Out[8]:
In [9]:
print len(tweets)
Extract the data we need into separate columns (links, mentions, hashtags)
In [10]:
import re

# A function that extracts the first hyperlink from the tweet's text.
def extract_link(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(regex, text)
    if match:
        return match.group()
    return ''

# A function that checks whether a word is contained in the tweet's text.
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False
In [11]:
tweets['link'] = tweets['text'].apply(lambda tweet: extract_link(tweet))
In [12]:
#remove links
tweets['text'] = tweets['text'].apply(lambda tweet: re.sub(r"http\S+", "", tweet))
In [13]:
# Functions to extract hashtags and mentions from the 'entities' field
def extract_hashtags(ent):
    return [hasht['text'].lower() for hasht in ent['hashtags']]

def extract_mentions(ent):
    return [usr_ment['screen_name'].lower() for usr_ment in ent['user_mentions']]
In [14]:
tweets['hashtags'] = map(extract_hashtags,tweets['entities'])
tweets['mentions'] = map(extract_mentions,tweets['entities'])
tweets.drop(['entities'],axis=1,inplace=True)
In [15]:
# the last two characters of place.full_name are the US state abbreviation
tweets['state'] = map(lambda place_dict: place_dict['full_name'][-2:], tweets['place'])
tweets['geography'] = map(lambda place_dict: place_dict['bounding_box'], tweets['place'])
tweets.drop(['place'], axis=1, inplace=True)
In [16]:
#make all text lowercase
tweets['text'] = tweets.text.apply(lambda x: x.lower())
In [17]:
tweets.columns
Out[17]:
In [ ]:
In [18]:
#SentiStrength
import subprocess
jar_path = "/home/antonis/sentistrength/SentiStrength.jar"
senti_data_path = "/home/antonis/sentistrength/SentiData/"
In [19]:
# Define a SentiStrength wrapper which takes a string or tokenized input and returns sentiment scores.
## TODO : caution: may be slow! (one JVM call per invocation, using '+'.join for token lists)
sample_text = 'this is something'

def SentiStrength(sample_text):
    '''Returns a list of [positive, negative] values'''
    if type(sample_text) is str:
        return subprocess.check_output(
            ['java', '-jar', jar_path, 'sentidata', senti_data_path, 'text', sample_text]).split()
    else:
        return subprocess.check_output(
            ['java', '-jar', jar_path, 'sentidata', senti_data_path, 'text', '+'.join(sample_text)]).split()
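As a quick sanity check (assuming the jar and SentiData paths above exist on your machine and Java is installed), the wrapper accepts either a raw string or a token list; the exact scores depend on your SentiStrength version:
In [ ]:
# Hedged example: both call styles supported by the wrapper above.
print SentiStrength('I love this debate')
print SentiStrength(['I', 'love', 'this', 'debate'])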
In [ ]:
In [21]:
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import random

# tokenize a document with the default CountVectorizer analyzer, then stem each token
def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

stemmer = SnowballStemmer('english')
analyzer = CountVectorizer().build_analyzer()
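The stemmed_words callable is not used directly here, but CountVectorizer accepts such a callable as its analyzer, which is presumably how the pickled vectorizer loaded below was built. A minimal sketch (the two example sentences are made up):
In [ ]:
# Hedged sketch: plug the stemming analyzer into a fresh CountVectorizer.
stem_vect = CountVectorizer(analyzer=stemmed_words)
X_counts = stem_vect.fit_transform(['I loved the debates', 'debating is lovely'])
print stem_vect.get_feature_names()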
In [22]:
# load the pre-trained model and its preprocessing objects
from sklearn.externals import joblib
import pickle
clf = joblib.load('trained models/NaiveBayes67k_chi2descr9-26_22,52.pkl')         # the classifier itself
selector = joblib.load("trained models/selector_chi267k_chi2descr9-26_22,52.pkl")  # chi2 feature selector
vect = joblib.load('trained models/vectdescr9-26_22,52.pkl')                        # CountVectorizer along with its vocabulary
In [23]:
from nltk import tokenize
In [ ]:
In [24]:
# transform the tweet text and predict a sentiment class with the trained Naive Bayes model
start_time = time.time()
results = clf.predict(selector.transform(vect.transform(tweets['text'])))
elapsed_time = time.time() - start_time
print elapsed_time
In [25]:
# predict class probabilities instead of hard labels
start_time = time.time()
results_prob = clf.predict_proba(selector.transform(vect.transform(tweets['text'])))
elapsed_time = time.time() - start_time
print elapsed_time
In [ ]:
In [26]:
results = pd.Series(results)
results_prob = pd.DataFrame(results_prob,columns=['negative','positive'])
In [27]:
tweets['NB'] = pd.Series(results)
tweets['NB_prob+'] = pd.Series(results_prob['positive'])
In [28]:
tweets.head()
Out[28]:
In [35]:
# manually inspect some random results
for i in range(1, 5):
    j = random.randint(0, len(tweets) - 1)
    tw = tweets['text'][j]
    print tw, '\n', tweets.ix[j, 'NB_prob+'], ' positive'
    print ''
In [37]:
#check the distribution of our probability estimates of the tweets
tweets['NB_prob+'].hist()
plt.show()
In [39]:
sa_results = map(lambda x: 'positive' if x>0.7 else 'negative' if x<0.3 else 'neutral' , tweets['NB_prob+'])
In [40]:
sa_results = pd.Series(sa_results)
In [41]:
sa_results.value_counts().plot(kind='bar', title='# of tweets in each sentiment class (30%/70% probability thresholds)')
plt.show()
In [42]:
tweets.head()
Out[42]:
In [ ]:
In [43]:
def trump_in_text(tweet):
    '''Takes the text of a tweet and returns True if it
    mentions Donald Trump, False otherwise.'''
    if ('donald' in tweet.lower()) or ('trump' in tweet.lower()):
        return True
    return False

def clinton_in_text(tweet):
    '''Takes the text of a tweet and returns True if it
    mentions Hillary Clinton, False otherwise.'''
    if ('hillary' in tweet.lower()) or ('clinton' in tweet.lower()):
        return True
    return False

def categorize(tr, hil):
    '''Categorizes each tweet based on which candidate
    (if exactly one) is mentioned in its text.'''
    if tr == hil:
        return 'irrelevant'
    elif tr:
        return 'Trump'
    else:
        return 'Clinton'
In [45]:
tweets['Trump'] = tweets['text'].apply(lambda tweet: trump_in_text(tweet))
tweets['Clinton'] = tweets['text'].apply(lambda tweet: clinton_in_text(tweet))
tweets['Politician']=map(lambda tr_col, hil_col: categorize(tr_col, hil_col), tweets['Trump'],tweets['Clinton'])
In [46]:
tweets.drop(['Trump','Clinton','geography'],axis=1,inplace=True)
In [47]:
tweets.head()
Out[47]:
In [48]:
tweets.Politician.unique()
Out[48]:
In [49]:
#### give a label to each tweet (ex. pro-Trump / anti-Hillary etc)
def label_tweet(pol,sent, upper_threshold=0.5):
'''Label tweet depending on politician and sentiment.
Return neutral if politician unknown or Naive Bayes prob close to 0.5
otherwise return Politician initials and +/-'''
if (((sent<upper_threshold) and (sent>(1-upper_threshold))) or (pol=='irrelevant')):
return 'N'
if (pol=='Trump'):
label='T'
if (pol=='Clinton'):
label='C'
if (sent>0.5):
return label+'+'
if (sent<0.5):
return label+'-'
return 'error'
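For a quick check of the labelling logic: with the default upper_threshold of 0.5 the neutral band is empty, so only 'irrelevant' tweets are labelled 'N'; widening the threshold (e.g. to 0.7) also sends borderline probabilities to the neutral class. The probability values below are made up for illustration:
In [ ]:
# Hedged examples of label_tweet behaviour (probabilities are illustrative).
print label_tweet('Trump', 0.9)                         # -> T+
print label_tweet('Clinton', 0.2, upper_threshold=0.7)  # -> C-
print label_tweet('Clinton', 0.6, upper_threshold=0.7)  # -> N
print label_tweet('irrelevant', 0.9)                    # -> N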
In [ ]:
In [50]:
tweets['label'] = map(lambda name,sent: label_tweet(name,sent) ,tweets['Politician'],tweets['NB_prob+'])
In [51]:
tweets.head()
Out[51]:
In [52]:
from data.US_states import states
In [53]:
#initialize a df indexed by label values
state_sentiment = pd.DataFrame(index=tweets.label.unique())
In [54]:
# count the tweets of each label per state
for state in states.keys():
    state_sentiment[state] = tweets[tweets['state'] == state]['label'].value_counts()
state_sentiment = state_sentiment.transpose()
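States with no tweets in a given label class end up as NaN counts; before summarizing, one might fill those with zeros (a judgement call, not part of the original flow):
In [ ]:
# Hedged: treat missing label counts as zero tweets for that state.
state_sentiment = state_sentiment.fillna(0)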
In [55]:
state_sentiment.describe()
Out[55]:
In [56]:
state_sentiment.head()
Out[56]:
In [155]:
pickle.dump(state_sentiment,open('results/state_sentiment_0.5.pickle','wb'))
In [ ]:
In [93]:
def get_sentiment_from_hashtag(hashtag_list,
                               anti_trump_list=set(['nevertrump', 'dumptrump']),
                               anti_hillary_list=set(['lockherup']),
                               pro_trump_list=set(['trumptrain']),
                               pro_hillary_list=set(['imwithher'])):
    '''Given a list of hashtags, classify the tweet as positive (1), neutral (0) or negative (-1)'''
    hashtag_list = set(hashtag_list)
    negative = positive = False
    if (len(hashtag_list.intersection(anti_trump_list)) > 0) | (len(hashtag_list.intersection(anti_hillary_list)) > 0):
        negative = True
    if (len(hashtag_list.intersection(pro_hillary_list)) > 0) | (len(hashtag_list.intersection(pro_trump_list)) > 0):
        positive = True
    if positive == negative:
        return 0  # either both positive and negative hashtags, or neither
    if positive:
        return 1
    if negative:
        return -1
    return 0  # no 'explanatory' hashtag found
In [97]:
# test our function
print get_sentiment_from_hashtag(['lockherup', 'trumptrain'])  # mixed signals -> 0
print get_sentiment_from_hashtag(['lockherup', 'dumptrump'])   # only negative hashtags -> -1
In [102]:
np.unique([tweets.hashtags.apply(lambda x: get_sentiment_from_hashtag(x))])
Out[102]:
In [106]:
tweets['sentiment_hashtag'] = tweets.hashtags.apply(lambda x: get_sentiment_from_hashtag(x))
In [107]:
tweets.head()
Out[107]:
In [116]:
negative_tweets = clf.predict(selector.transform(vect.transform(tweets.text[tweets.sentiment_hashtag==-1])))
negative_idx = tweets.text[tweets.sentiment_hashtag==-1].index
In [117]:
# inspect a few of the tweets that the hashtags flag as negative
for i in range(1, 5):
    j = negative_idx[random.randint(0, len(negative_idx) - 1)]
    print tweets.ix[j, 'NB_prob+'], tweets.text[j], '\n'
In [118]:
pd.Series(negative_tweets).value_counts()
Out[118]:
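A rough agreement figure between the hashtag rule and the classifier can then be read off these counts, e.g. as the share of hashtag-negative tweets in each predicted class:
In [ ]:
# Hedged follow-up: proportion of hashtag-negative tweets per predicted class,
# i.e. how often the classifier agrees with the hashtag-based labels.
print pd.Series(negative_tweets).value_counts(normalize=True)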